CNV Analysis


In [1]:
import cPickle
import datetime
import glob
import os
import random
import re
import subprocess

import cdpybio as cpb
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import tabix
import vcf as pyvcf

import cardipspy as cpy
import ciepy

%matplotlib inline

dy_name = 'cnv_analysis'

import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)
    
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)

In [2]:
transcript_to_gene = pd.read_table(cpy.gencode_transcript_gene, header=None, 
                                   squeeze=True, index_col=0)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 
                  'tpm_log_filtered_phe_std_norm_peer_resid.tsv')
exp = pd.read_table(fn, index_col=0)

dy = os.path.join(ciepy.root, 'output/eqtl_processing/eqtls01')
fn = os.path.join(dy, 'qvalues.tsv')
qvalues = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'lead_variants.tsv')
lead_vars = pd.read_table(fn, index_col=0)
lead_vars = lead_vars[lead_vars.perm_sig]
fn = os.path.join(dy, 'gene_variant_pairs.tsv')
gene_variant = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output/cnv_processing/gs_info.pickle')
gs_info = cPickle.load(open(fn))
fn = os.path.join(ciepy.root, 'private_output/cnv_processing/gs_genotypes.tsv')
gs_geno = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output/cnv_processing/gs_combined_info.pickle')
gs_combined_info = cPickle.load(open(fn))
fn = os.path.join(ciepy.root, 'private_output/cnv_processing/gs_combined_genotypes.tsv')
gs_combined_geno = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output/cnv_processing/lumpy_info.pickle')
lumpy_info = cPickle.load(open(fn))
fn = os.path.join(ciepy.root, 'private_output/cnv_processing/lumpy_genotypes.tsv')
lumpy_geno = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output/cnv_processing/combined_info.pickle')
combined_info = cPickle.load(open(fn))
combined_info = combined_info[combined_info.emmax]

gene_bt = pbt.BedTool(cpy.gencode_gene_bed)

# fn = os.path.join(ciepy.root, 'output', 'cnv_processing', 'encode_dnase_overlap.tsv')
# encode_dnase_overlap = pd.read_table(fn, index_col=0)
# fn = os.path.join(ciepy.root, 'output', 'cnv_processing', 'gs_encode_tf_chip_seq_overlap.tsv')
# gs_encode_chip_overlap = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'cnv_processing', 'gs_roadmap_overlap.tsv')
gs_roadmap_overlap = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'cnv_processing', 'lumpy_roadmap_overlap.tsv')
lumpy_roadmap_overlap = pd.read_table(fn, index_col=0)

# fn = os.path.join(ciepy.root, 'private_output', 'cnv_processing', 'emmax_cnvs.tsv')
# emmax_cnvs = pd.read_table(fn, index_col=0)

I want to identify CNVs that are within 1Mb of TSSs for genes we tested. I'll throw out anything that we didn't test at all.


In [3]:
combined_info = combined_info[(combined_info.chrom != 'chrX') & (combined_info.chrom != 'chrY')]

In [4]:
tss = pd.read_table(cpy.gencode_tss_bed, header=None)
tss[3] = tss[3].apply(lambda x: transcript_to_gene[x.split('_')[0]])
tss = tss.merge(qvalues, left_on=3, right_index=True)
tss = tss.drop(['perm_pvalue', 'perm_qvalue', 'perm_sig'], axis=1)
s = '\n'.join(tss.astype(str).apply(lambda x: '\t'.join(x), axis=1)) + '\n'
tss_bt = pbt.BedTool(s, from_string=True)
tss_bt = tss_bt.sort()

In [5]:
s = '\n'.join(combined_info.chrom + '\t' + combined_info.start.astype(str) + 
              '\t' + combined_info.end.astype(str) + '\t' + combined_info.name) + '\n'
cnv_bt = pbt.BedTool(s, from_string=True)
cnv_bt = cnv_bt.sort()

res = cnv_bt.closest(tss_bt, sorted=True, d=True)
df = res.to_dataframe()
df = df[df.blockSizes < 1000000]
combined_info = combined_info.ix[set(df.name)]

I have to reverse the sign of LUMPY deletions because people who are het for a lumpy deletion have one less copy number than reference and people are homo alt have two less copy number than reference. All other CNVs are encoded where alts are increasing copy number.


In [6]:
cnv_lead_vars = lead_vars[lead_vars.variant_type == 'cnv']
cnv_lead_vars['cnv_id'] = cnv_lead_vars.marker_id.apply(lambda x: '_'.join(x.split('_')[2:]))
cnv_lead_vars['svtype'] = combined_info.ix[cnv_lead_vars.cnv_id, 'svtype'].values
ind = cnv_lead_vars[(cnv_lead_vars.variant_caller == 'lumpy') & 
                    (cnv_lead_vars.svtype == 'DEL')].index
cnv_lead_vars.ix[ind, 'beta'] = cnv_lead_vars.ix[ind, 'beta'] * -1

cnv_gv = gene_variant[gene_variant.variant_type == 'cnv']
cnv_gv['cnv_id'] = cnv_gv.marker_id.apply(lambda x: '_'.join(x.split('_')[2:]))

In [7]:
print('{:,} distinct CNVs we tested.'.format(combined_info.shape[0]))
a = len(set(cnv_lead_vars.gene_id))
b = len(set(cnv_lead_vars.marker_id))
print('{} eQTLs with CNV as lead variant ({} distinct CNVs).'.format(a, b))
a = cnv_gv.shape[0]
b = len(set(cnv_gv.gene_id))
c = len(set(cnv_gv.marker_id))
print('{:,} gene-CNV associations for {} genes and {} CNVs.'.format(a, b, c))


15,281 distinct CNVs we tested.
108 eQTLs with CNV as lead variant (116 distinct CNVs).
785 gene-CNV associations for 247 genes and 550 CNVs.

In [8]:
combined_info.ix[set(cnv_lead_vars.cnv_id)].svtype.value_counts()


Out[8]:
DEL      47
DUP      41
MIXED    28
Name: svtype, dtype: int64

In [9]:
cnv_gv['cnv_is_lead'] = cnv_gv.gene_id.apply(lambda x: x in cnv_lead_vars.gene_id.values)
lead_vars['cnv_is_lead'] = lead_vars.gene_id.apply(lambda x: x in cnv_lead_vars.gene_id.values)
lead_vars['cnv_sig'] = lead_vars.gene_id.apply(lambda x: x in cnv_gv.gene_id.values)

In [10]:
a = len(set(gene_variant.gene_id))
b = len(set(gene_variant.ix[gene_variant.variant_type != 'cnv', 'gene_id']))
print('{} eGenes would have been missed if CNVs were not included'.format(a - b))


64 eGenes would have been missed if CNVs were not included

In [11]:
combined_info.head()


Out[11]:
chrom start end name overlaps_gene contains_gene overlaps_gene_exon nearest_tss_dist svlen svtype variant_caller emmax
name
CNV_3_111243630_111248893 chr3 111243630 111248893 CNV_3_111243630_111248893 {ENSG00000153283.8} NaN NaN 11963 5263 DEL genomestrip True
DUP_13_114818444_114818689 chr13 114818444 114818689 DUP_13_114818444_114818689 {ENSG00000185989.9} NaN NaN 24749 245 DUP lumpy True
DEL_4_115928723_115931879 chr4 115928723 115931879 DEL_4_115928723_115931879 {ENSG00000138653.5} NaN NaN 103054 -3156 DEL lumpy True
DEL_5_30915523_30916055 chr5 30915523 30916055 DEL_5_30915523_30916055 NaN NaN NaN 137617 -532 DEL lumpy True
DEL_10_100496981_100497293 chr10 100496981 100497293 DEL_10_100496981_100497293 {ENSG00000172987.8} NaN NaN 276416 -312 DEL lumpy True

GO enrichment

fn = os.path.join(outdir, 'go_results.tsv') if not os.path.exists(fn): t = pd.Series(False, set(gene_variant.gene_id)) t[set(cnv_gv.gene_id)] = True go_res = cpb.analysis.goseq_gene_enrichment( [x.split('.')[0] for x in t.index], t.values, plot_fn=None, length_correct=False) go_res.to_csv(fn, sep='\t') else: go_res = pd.read_table(fn, index_col=0)
go_res[['over_represented_pvalue', 'term', 'over_represented_pvalue_bh']].head()

Effect sizes


In [12]:
fig,axs = plt.subplots(3, 2, figsize=(7, 10))

# Lead variants, no sig CNV
ax = axs[0, 0]
lead_vars[lead_vars.cnv_sig == False].beta.hist(ax=ax, bins=np.arange(-2, 2.1, 0.1))
ax.set_xlim(-2, 2)
ax.set_ylabel('Number of associations')
ax.set_title('Lead vars, no sig CNV for gene')

# CNV is lead variant
ax = axs[0, 1]
# For lumpy deletions, having an alt alleles means lower copy number, so I need
# to reverse the beta values.
se = cnv_lead_vars.beta
ind = cnv_lead_vars[(cnv_lead_vars.variant_caller == 'lumpy') & 
                    (cnv_lead_vars.marker_id.apply(lambda x: 'DEL' in x))].index
se[ind] = -se[ind]
se.hist(ax=ax, bins=np.arange(-2, 2.1, 0.1))
ax.set_xlim(-2, 2)
ax.set_title('CNV is lead variant')

# All CNV associations
ax = axs[1, 0]
se = cnv_gv.beta
ind = cnv_gv[(cnv_gv.variant_caller == 'lumpy') & 
             (cnv_gv.marker_id.apply(lambda x: 'DEL' in x))].index
se[ind] = -se[ind]
se.hist(ax=ax, bins=np.arange(-2, 2.1, 0.1))
ax.set_xlim(-2, 2)
ax.set_ylabel('Number of associations')
ax.set_xlabel('beta')
ax.set_title('All CNV associations')

# All CNV associations for genes where CNV is not the lead
ax = axs[1, 1]
se = cnv_gv.beta
ind = cnv_gv[(cnv_gv.variant_caller == 'lumpy') & 
             (cnv_gv.marker_id.apply(lambda x: 'DEL' in x))].index
se[ind] = -se[ind]
se = se[cnv_gv.cnv_is_lead == False]
se.hist(ax=ax, bins=np.arange(-2, 2.1, 0.1))
ax.set_xlim(-2, 2)
ax.set_xlabel('beta')
ax.set_title('CNV associations, lead not CNV')
ax = axs[2, 0]
cnv_gv.sort_values('pvalue').drop_duplicates('gene_id').beta.hist(ax=ax, bins=np.arange(-2, 2.1, 0.1))
ax.set_xlim(-2, 2)
ax.set_xlabel('beta')
ax.set_title('CNV lead variants')
ax = axs[2, 1]
cnv_gv[cnv_gv.cnv_is_lead == False].sort_values('pvalue').drop_duplicates('gene_id').beta.hist(
    ax=ax, bins=np.arange(-2, 2.1, 0.1))
ax.set_xlim(-2, 2)
ax.set_xlabel('beta')
ax.set_title('CNV lead variants, lead not CNV')
plt.tight_layout()



In [13]:
cnv_lead_vars.sort_values(by='pvalue', inplace=True)
lead_vars.sort_values(by='pvalue', inplace=True)

In [14]:
lead_vars[lead_vars.cnv_sig == False].beta.abs().hist(color='blue', alpha=0.5, 
                                                      normed=True, label='No sig CNV',
                                                      bins=np.arange(0, 3, 0.1))
cnv_lead_vars.beta.abs().hist(color='green', alpha=0.5, normed=True, label='Lead CNV',
                              bins=np.arange(0, 3, 0.1))
plt.xlabel('$|\\beta|$')
plt.ylabel('Density')
plt.legend();
s,p = stats.mannwhitneyu(lead_vars[lead_vars.cnv_sig == False].beta.abs(),
                         cnv_lead_vars.beta.abs())
print('p={}, Mann Whitney U'.format(p))


p=0.00173688834161, Mann Whitney U

In [15]:
cnv_abs_beta_pdf = pd.Series(index=np.arange(0, 3 + 0.05, 0.05))
density = stats.gaussian_kde(cnv_lead_vars.drop_duplicates('gene_id').beta.abs())
cnv_abs_beta_pdf = pd.Series(density(cnv_abs_beta_pdf.index), index=cnv_abs_beta_pdf.index)
snv_abs_beta_pdf = pd.DataFrame(index=np.arange(0, 3 + 0.005, 0.005))
density = stats.gaussian_kde(lead_vars[lead_vars.cnv_sig == False].drop_duplicates('gene_id').beta.abs())
snv_abs_beta_pdf = pd.Series(density(snv_abs_beta_pdf.index), index=snv_abs_beta_pdf.index)

In [16]:
snv_abs_beta_pdf.plot(label='No sig CNV')
cnv_abs_beta_pdf.plot(label='Lead CNV')
plt.xlabel('$|\\beta|$')
plt.ylabel('Density')
plt.legend();



In [17]:
fig,axs = plt.subplots(1, 2, figsize=(8, 4))
ax = axs[0]
lead_vars[lead_vars.cnv_is_lead == False].drop_duplicates('location').maf.hist(ax=ax)
ax.set_xlabel('MAF')
ax.set_ylabel('Number of SNV/indel lead variants')
ax = axs[1]
cnv_lead_vars.drop_duplicates('location').maf.hist(ax=ax)
ax.set_xlabel('MAF')
ax.set_ylabel('Number of lead CNVs')
plt.tight_layout()



In [18]:
fig,axs = plt.subplots(1, 2, figsize=(8, 4))
ax = axs[0]
ax.scatter(lead_vars[lead_vars.cnv_sig == False].maf, 
           lead_vars[lead_vars.cnv_sig == False].beta.abs())
ax.set_xlabel('MAF')
ax.set_ylabel('$|\\beta|$')
ax.set_title('No sig CNV')
ax = axs[1]
ax.scatter(cnv_lead_vars.maf, cnv_lead_vars.beta.abs())
ax.set_xlabel('MAF')
ax.set_ylabel('$|\\beta|$')
ax.set_title('CNV lead')
plt.tight_layout()



In [19]:
s,p = stats.mannwhitneyu(lead_vars[lead_vars.cnv_sig == False].beta,
                         cnv_lead_vars.beta)
print('Effect size for lead variants for eGenes with no lead CNV vs. '
      'lead CNVs for eGenes with lead CNV: p={:.2e}'.format(p))
s,p = stats.mannwhitneyu(lead_vars[lead_vars.cnv_sig == False].beta,
                         cnv_gv.beta)
print('Effect size for lead variants for eGenes with no lead CNV vs. '
      'all sig. CNVs for eGenes with sig CNV: p={:.2e}'.format(p))
s,p = stats.mannwhitneyu(cnv_gv[cnv_gv.cnv_is_lead == False].beta,
                         cnv_gv.beta)
print(p)
s,p = stats.mannwhitneyu(lead_vars[lead_vars.cnv_sig == False].beta,
                         cnv_gv.sort_values('pvalue').drop_duplicates('gene_id').beta)
print(p)
s,p = stats.mannwhitneyu(lead_vars[lead_vars.cnv_sig == False].beta,
                         cnv_gv[cnv_gv.cnv_is_lead == False].sort_values('pvalue').drop_duplicates('gene_id').beta)
print(p)


Effect size for lead variants for eGenes with no lead CNV vs. lead CNVs for eGenes with lead CNV: p=2.89e-19
Effect size for lead variants for eGenes with no lead CNV vs. all sig. CNVs for eGenes with sig CNV: p=3.80e-33
1.11942740191e-14
3.29663797366e-07
0.460461277576

Properties


In [20]:
s = '\n'.join(combined_info.chrom + '\t' + combined_info.start.astype(str) + 
              '\t' + combined_info.end.astype(str) + '\t' + combined_info.name) + '\n'
cnv_bt = pbt.BedTool(s, from_string=True)
cnv_bt = cnv_bt.sort()

res = cnv_bt.intersect(gene_bt, sorted=True, wo=True)
df = res.to_dataframe()
overlaps_gene = set(df.name)

cnv_gv['cnv_gene'] = cnv_gv.cnv_id + ':' + cnv_gv.gene_id
t = pd.DataFrame(True, index=df.name + ':' + df.thickEnd, columns=['cnv_overlaps_gene'])
cnv_gv = cnv_gv.merge(t, left_on='cnv_gene', right_index=True, how='left')
cnv_gv.ix[cnv_gv.cnv_overlaps_gene.isnull(), 'cnv_overlaps_gene'] = False

In [21]:
sig_cnvs = set(cnv_gv.cnv_id)
not_sig_cnvs = set(combined_info.index) - sig_cnvs

Length


In [22]:
fig,axs = plt.subplots(1, 2)
ax = axs[0]
np.log10(combined_info.ix[sig_cnvs, 'svlen']).abs().hist(ax=ax)
ax.set_title('Significant: median={:.2f}'.format(np.log10(combined_info.ix[sig_cnvs, 'svlen'].abs()).median()))
ax = axs[1]
np.log10(combined_info.ix[not_sig_cnvs, 'svlen'].abs()).hist()
ax.set_title('Not significant: median={:.2f}'.format(np.log10(combined_info.ix[not_sig_cnvs, 'svlen'].abs().median())));
plt.tight_layout()



In [23]:
s,p = stats.mannwhitneyu(combined_info.ix[sig_cnvs, 'svlen'].abs(), 
                         combined_info.ix[not_sig_cnvs, 'svlen'].abs())
print('CNVs with significant associations are longer than those without '
      'significant associations: p = {:.2e}'.format(p))
a = combined_info.ix[sig_cnvs, 'svlen'].abs().median()
b = combined_info.ix[not_sig_cnvs, 'svlen'].abs().median()
print('Sig median length = {}'.format(a))
print('Not sig median length = {}'.format(b))


CNVs with significant associations are longer than those without significant associations: p = 1.07e-75
Sig median length = 2312.5
Not sig median length = 527.0

Distance to nearest TSS


In [24]:
fig,axs = plt.subplots(1, 2)
ax = axs[0]
np.log10(combined_info.ix[sig_cnvs, 'nearest_tss_dist'].abs() + 1).hist(ax=ax)
ax.set_title('Significant')
ax = axs[1]
np.log10(combined_info.ix[not_sig_cnvs, 'nearest_tss_dist'].abs() + 1).hist()
ax.set_title('Not significant');
plt.tight_layout()


We can see that significant CNVs are typically closer to a TSS than CNVs that are not signficant.


In [25]:
a = combined_info.ix[sig_cnvs, 'nearest_tss_dist'].abs()
b = combined_info.ix[not_sig_cnvs, 'nearest_tss_dist'].abs()
s,p = stats.mannwhitneyu(a, b)
print('CNVs with significant associations are closer to a TSS than those without '
      'significant associations: p = {:.2e}'.format(p))
print('Median distance to nearest TSS for sig CNVs: {:,}'.format(a.median()))
print('Median distance to nearest TSS for not sig CNVs: {:,}'.format(b.median()))


CNVs with significant associations are closer to a TSS than those without significant associations: p = 9.91e-76
Median distance to nearest TSS for sig CNVs: 3,539.0
Median distance to nearest TSS for not sig CNVs: 12,391.0

In [26]:
fig,axs = plt.subplots(1, 2)
ax = axs[0]
#a = combined_info.ix[sig_cnvs - set(cnv_gv.ix[cnv_gv.cnv_overlaps_gene, 'cnv_id']), 'nearest_tss_dist'].abs()
a = combined_info.ix[sig_cnvs, 'nearest_tss_dist'].abs()
a = a[a != 0]
np.log10(a.abs()).hist(ax=ax)
ax.set_title('Significant, median={:.1f}'.format(np.log10(a.abs()).median()))
ax = axs[1]
b = combined_info.ix[not_sig_cnvs, 'nearest_tss_dist'].abs()
b = b[b != 0]
ax.set_title('Not significant, median={:.1f}'.format(np.log10(b.abs()).median()))
np.log10(b.abs()).hist(ax=ax);



In [27]:
#a = combined_info.ix[sig_cnvs - set(cnv_gv.ix[cnv_gv.cnv_overlaps_gene, 'cnv_id']), 'nearest_tss_dist'].abs()
a = combined_info.ix[sig_cnvs, 'nearest_tss_dist'].abs()
a = a[a != 0]
b = combined_info.ix[not_sig_cnvs, 'nearest_tss_dist'].abs()
b = b[b != 0]
s,p = stats.mannwhitneyu(a, b)
print('Median distance to nearest TSS for sig CNVs: {:,}'.format(a.median()))
print('Median distance to nearest TSS for not sig CNVs: {:,}'.format(b.median()))
print('Removing all CNVs that overlap any gene, CNVs with significant '
      'associations are closer to a TSS than those without '
      'significant associations: p = {:.2e}'.format(p))


Median distance to nearest TSS for sig CNVs: 5,451.0
Median distance to nearest TSS for not sig CNVs: 13,700.5
Removing all CNVs that overlap any gene, CNVs with significant associations are closer to a TSS than those without significant associations: p = 6.71e-40

Even if we remove CNVs that overlap genes, the significant CNVs are still closer to genes.

Gene overlap


In [28]:
overlaps_sig = len(sig_cnvs & overlaps_gene)
overlaps_not_sig = len(not_sig_cnvs & overlaps_gene)
no_overlaps_sig = len(sig_cnvs) - overlaps_sig
no_overlaps_not_sig = len(not_sig_cnvs) - overlaps_not_sig
ctable = [[overlaps_sig, overlaps_not_sig], 
          [no_overlaps_sig, no_overlaps_not_sig]]
odds,p = stats.fisher_exact(ctable)
print('CNVs with significant association are enriched for '
      'overlapping a gene p={:.1e}'.format(p))


CNVs with significant association are enriched for overlapping a gene p=5.6e-17

Intergenic CNV eQTLs

I'm going to look at CNV-eQTLs where the CNV doesn't overlap the affected gene. This doesn't mean they are necessarily intergenic because they could overlap another gene, but they don't overlap the gene they are associated with.

I used the raw CNV calls for finding eQTLs, but I'll switch here to use the combined CNVs so I can be conservative in defining whether the CNV overlaps the gene.


In [29]:
fn = os.path.join(ciepy.root, 'output', 'cnv_processing', 'combined_mapping.tsv')
mapping = pd.read_table(fn, index_col=0, header=None, squeeze=True)

For each gene, I'll define whether the gene overlaps a significant CNV. First, I'll just look at the CNVs as they are defined by Genome Strip. Then I'll include my merged CNVs. Finally, I'll merge all CNVs that are associated with the gene and see whether the gene falls in this region.


In [30]:
overlaps = set(cnv_gv.ix[cnv_gv.cnv_overlaps_gene, 'gene_id'])
t = cnv_gv[cnv_gv.gene_id.apply(lambda x: x not in overlaps)]
t = t[t.cnv_id.apply(lambda x: x in set(t.cnv_id) & set(mapping.index))]
t['combined'] = mapping[t.cnv_id].values
t = t.merge(gs_combined_info, left_on='combined', right_index=True, suffixes=['', '_combined'])
t = t.dropna(subset=['overlaps_gene'])
t = t[t.apply(lambda x: x['gene_id'] in x['overlaps_gene'], axis=1)]
overlaps |= set(t.gene_id)
t = cnv_gv[cnv_gv.gene_id.apply(lambda x: x not in overlaps)]
for gene in set(t.gene_id):
    tdf = t[t.gene_id == gene]
    start = tdf.start.min()
    end = tdf.end.max()
    if not (gene_info.ix[gene, 'start'] > end or gene_info.ix[gene, 'end'] < start):
        overlaps.add(gene)
overlaps = pd.Series(True, index=overlaps)
overlaps = pd.concat([pd.Series(False, index=list(set(cnv_gv.gene_id) - set(overlaps.index))), overlaps])
print('{} of {} genes with significant CNV eQTLs overlap the CNV.'.format(overlaps.sum(), overlaps.shape[0]))
print('{} of {} genes with significant CNV eQTLs don\'t overlap the CNV.'.format(
        sum(overlaps == False), overlaps.shape[0]))


99 of 247 genes with significant CNV eQTLs overlap the CNV.
148 of 247 genes with significant CNV eQTLs don't overlap the CNV.

In [31]:
102. / 247


Out[31]:
0.41295546558704455

In [32]:
cnv_gv['cnv_overlaps_gene_cons'] = False
cnv_gv.ix[cnv_gv.gene_id.apply(lambda x: x in overlaps[overlaps].index), 'cnv_overlaps_gene_cons'] = True

In [33]:
a = len(set(cnv_gv[(cnv_gv.cnv_is_lead) & (cnv_gv.cnv_overlaps_gene_cons)].gene_id))
b = len(set(cnv_gv[cnv_gv.cnv_is_lead].gene_id))
print('{} of {} eGenes with CNV lead variant overlap a significant CNV.'.format(a, b))


59 of 108 eGenes with CNV lead variant overlap a significant CNV.

In [34]:
cnv_gv.to_pickle(os.path.join(outdir, 'cnv_gene_variants.pickle'))
combined_info.to_pickle(os.path.join(outdir, 'combined_info.pickle'))
lead_vars.to_pickle(os.path.join(outdir, 'lead_variants.pickle'))

I'll get one significant CNV per gene with a significant CNV association.


In [35]:
cnv_gv.sort_values(by=['gene_id', 'pvalue'], inplace=True)

In [36]:
sig_genic = set(cnv_gv[cnv_gv.cnv_overlaps_gene_cons].drop_duplicates(subset=['gene_id']).cnv_id)
sig_not_genic = set(cnv_gv[cnv_gv.cnv_overlaps_gene_cons == False].drop_duplicates(subset=['gene_id']).cnv_id)
not_sig = set(combined_info.index) - set(cnv_gv.cnv_id)

In [37]:
len(sig_genic & sig_not_genic)


Out[37]:
16

There is some overlap between sig_genic and sig_not_genic because some particular CNVs are eQTLs for multiple genes and overlap some of those genes but not others.

s = set(cnv_gv.ix[cnv_gv.cnv_overlaps_gene, 'gene_id']) intergenic_cnv_gv = cnv_gv[cnv_gv.gene_id.apply(lambda x: x not in s)] sig_intergenic_cnvs = set(intergenic_cnv_gv.cnv_id) not_sig_cnvs = set(combined_info.index) - set(cnv_gv.cnv_id)

In [38]:
promoters = pbt.BedTool('/publicdata/gencode_v19_20151104/promoters_by_gene.bed')
df = promoters.to_dataframe()
df['gene'] = df.name.apply(lambda x: x.split('_')[0])
df['region'] = df.name.apply(lambda x: x.split('_')[2])
df.index = df.region
gb = df.groupby('gene')
gene_to_promoters = gb.groups

In [39]:
fig,axs = plt.subplots(1, 2, figsize=(8, 4))
tdf = cnv_gv.sort_values(by=['cnv_overlaps_gene', 'pvalue'], 
                         ascending=[False, True]).drop_duplicates(subset=['gene_id'])
a = cnv_gv[cnv_gv.cnv_overlaps_gene_cons]
b = cnv_gv[cnv_gv.cnv_overlaps_gene_cons == False]
ax = axs[0]
ax.set_ylabel('Fraction of genes')
ax.set_xlabel('Beta')
ax.set_title('Genic')
a.beta.hist(alpha=0.5, bins=np.arange(-2, 2.2, 0.2), normed=True, label='genic', ax=ax)
ax = axs[1]
b.beta.hist(alpha=0.5, bins=np.arange(-2, 2.2, 0.2), normed=True, label='intergenic', ax=ax)
ax.set_title('Intergenic')
ax.set_xlabel('Beta')
plt.tight_layout()
plt.savefig(os.path.join(outdir, 'genic_vs_intergenic_beta.pdf'))


overlaps_sig = len(sig_intergenic_cnvs & overlaps_gene) overlaps_not_sig = len(not_sig_cnvs & overlaps_gene) no_overlaps_sig = len(sig_intergenic_cnvs) - overlaps_sig no_overlaps_not_sig = len(not_sig_cnvs) - overlaps_not_sig ctable = [[overlaps_sig, overlaps_not_sig], [no_overlaps_sig, no_overlaps_not_sig]] odds,p = stats.fisher_exact(ctable) print('CNVs with significant association are enriched for ' 'overlapping a gene p={:.1e}'.format(p))

While these CNVs don't overlap the gene that they are an eQTL for, they are still enriched for overlapping some other gene.


In [40]:
roadmap_overlap = pd.concat([gs_roadmap_overlap, lumpy_roadmap_overlap])
roadmap_overlap = roadmap_overlap.ix[combined_info.index]

In [41]:
cnv_gv.to_pickle(os.path.join(outdir, 'cnv_gene_variants.pickle'))
combined_info['tss_bin'] = pd.cut(np.log10(combined_info.nearest_tss_dist.abs() + 1), bins=10) combined_info['len_bin'] = pd.cut(np.log10(combined_info.svlen.abs()), bins=10) combined_info['bin'] = combined_info.tss_bin.astype(str) + '_' + combined_info.len_bin.astype(str)

Functional annotation enrichment

Sig. genic vs. not sig.

First I'll compare significant genic CNVs to CNVs that weren't significant.


In [68]:
odds = []
pvalues = []
for c in roadmap_overlap.columns:
    vc = (roadmap_overlap.ix[sig_genic, c] > 0).value_counts()
    sig_overlap = vc[True]
    sig_not_overlap = vc[False]
    vc = (roadmap_overlap.ix[not_sig, c] > 0).value_counts()
    not_sig_overlap = vc[True]
    not_sig_not_overlap = vc[False]
    ctable = [[sig_overlap, sig_not_overlap], 
              [not_sig_overlap, not_sig_not_overlap]]
    o,p = stats.fisher_exact(ctable)
    odds.append(o)
    pvalues.append(p)
roadmap_res = pd.DataFrame({'odds':odds, 'pvalue':pvalues}, index=roadmap_overlap.columns)
colors = pd.DataFrame(zip(list(set([x.split('_')[-1] for x in roadmap_res.index])), 
                          cpb.analysis.tableau20[::2]), columns=['mark', 'color'])
roadmap_res['mark'] = [x.split('_')[-1] for x in roadmap_res.index]
roadmap_res['ind'] = roadmap_res.index
roadmap_res = roadmap_res.merge(colors)
roadmap_res.index = roadmap_res.ind
roadmap_res['log_odds'] = np.log2(roadmap_res.odds)
roadmap_res['neg_log_pvalue'] = -np.log10(roadmap_res.pvalue)

roadmap_res.to_csv(os.path.join(outdir, 'sig_genic_vs_not_sig_roadmap_res.tsv'), sep='\t')

sns.stripplot(y='mark', x='neg_log_pvalue', data=roadmap_res, jitter=0.3, orient='h');
sns.lmplot(x='log_odds', y='neg_log_pvalue', data=roadmap_res, hue='mark', fit_reg=False);


We can see enrichment for various active regulatory marks, particularly H3K36me3 which is associated with the transcribed parts of genes.

Sig. not genic vs. genic sig.

Now let's compare significant genic versus significant not genic CNVs.


In [70]:
odds = []
pvalues = []
for c in roadmap_overlap.columns:
    vc = (roadmap_overlap.ix[sig_not_genic, c] > 0).value_counts()
    sig_overlap = vc[True]
    sig_not_overlap = vc[False]
    vc = (roadmap_overlap.ix[sig_genic, c] > 0).value_counts()
    not_sig_overlap = vc[True]
    not_sig_not_overlap = vc[False]
    ctable = [[sig_overlap, sig_not_overlap], 
              [not_sig_overlap, not_sig_not_overlap]]
    o,p = stats.fisher_exact(ctable)
    odds.append(o)
    pvalues.append(p)
roadmap_res = pd.DataFrame({'odds':odds, 'pvalue':pvalues}, index=roadmap_overlap.columns)
colors = pd.DataFrame(zip(list(set([x.split('_')[-1] for x in roadmap_res.index])), 
                          cpb.analysis.tableau20[::2]), columns=['mark', 'color'])
roadmap_res['mark'] = [x.split('_')[-1] for x in roadmap_res.index]
roadmap_res['ind'] = roadmap_res.index
roadmap_res = roadmap_res.merge(colors)
roadmap_res.index = roadmap_res.ind
roadmap_res['log_odds'] = np.log2(roadmap_res.odds)
roadmap_res['neg_log_pvalue'] = -np.log10(roadmap_res.pvalue)

roadmap_res.to_csv(os.path.join(outdir, 'sig_not_genic_vs_sig_genic_roadmap_res.tsv'), sep='\t')

sns.stripplot(y='mark', x='neg_log_pvalue', data=roadmap_res, jitter=0.3, orient='h');
sns.lmplot(x='log_odds', y='neg_log_pvalue', data=roadmap_res, hue='mark', fit_reg=False);


We can see that the genic CNVs are enriched for H3K36me3 relative to the not genic CNVs. So the not genic CNVs are noticeably different.

Sig. not genic vs. not sig.


In [71]:
odds = []
pvalues = []
for c in roadmap_overlap.columns:
    vc = (roadmap_overlap.ix[sig_not_genic, c] > 0).value_counts()
    sig_overlap = vc[True]
    sig_not_overlap = vc[False]
    vc = (roadmap_overlap.ix[not_sig, c] > 0).value_counts()
    not_sig_overlap = vc[True]
    not_sig_not_overlap = vc[False]
    ctable = [[sig_overlap, sig_not_overlap], 
              [not_sig_overlap, not_sig_not_overlap]]
    o,p = stats.fisher_exact(ctable)
    odds.append(o)
    pvalues.append(p)
roadmap_res = pd.DataFrame({'odds':odds, 'pvalue':pvalues}, index=roadmap_overlap.columns)
colors = pd.DataFrame(zip(list(set([x.split('_')[-1] for x in roadmap_res.index])), 
                          cpb.analysis.tableau20[::2]), columns=['mark', 'color'])
roadmap_res['mark'] = [x.split('_')[-1] for x in roadmap_res.index]
roadmap_res['ind'] = roadmap_res.index
roadmap_res = roadmap_res.merge(colors)
roadmap_res.index = roadmap_res.ind
roadmap_res['log_odds'] = np.log2(roadmap_res.odds)
roadmap_res['neg_log_pvalue'] = -np.log10(roadmap_res.pvalue)

roadmap_res.to_csv(os.path.join(outdir, 'sig_not_genic_vs_not_sig_roadmap_res.tsv'), sep='\t')

sns.stripplot(y='mark', x='neg_log_pvalue', data=roadmap_res, jitter=0.3, orient='h');
sns.lmplot(x='log_odds', y='neg_log_pvalue', data=roadmap_res, hue='mark', fit_reg=False);


We see enrichment for various active regulatory marks. So these CNVs are hitting regulatory regions.


In [72]:
plt.figure(figsize=(7, 11))
(np.log2(roadmap_res.odds)).plot(kind='barh', color=roadmap_res.color)
plt.tight_layout()
plt.xlabel('$\log_2$ odds')
plt.ylabel('')
plt.tight_layout()
plt.savefig(os.path.join(outdir, 'intergenic_histone_mark_odds.pdf'))



In [73]:
plt.figure(figsize=(7, 11))
(-np.log10(roadmap_res.pvalue)).plot(kind='barh', color=roadmap_res.color)
plt.xlabel('$-\log_{10}$ $p$-value')
plt.tight_layout()


odds = [] pvalues = [] for c in encode_dnase_overlap.columns: vc = (encode_dnase_overlap.ix[sig_intergenic_cnvs, c] > 0).value_counts() sig_overlap = vc[True] sig_not_overlap = vc[False] vc = (encode_dnase_overlap.ix[not_sig_cnvs, c] > 0).value_counts() not_sig_overlap = vc[True] not_sig_not_overlap = vc[False] ctable = [[sig_overlap, sig_not_overlap], [not_sig_overlap, not_sig_not_overlap]] o,p = stats.fisher_exact(ctable) odds.append(o) pvalues.append(p) encode_dnase_res = pd.DataFrame({'odds':odds, 'pvalue':pvalues}, index=encode_dnase_overlap.columns)
(-np.log10(encode_dnase_res.pvalue)).plot(kind='barh') plt.tight_layout()
odds = [] pvalues = [] for c in encode_chip_overlap.columns: vc = (encode_chip_overlap.ix[sig_intergenic_cnvs, c] > 0).value_counts() if True not in vc.index: sig_overlap = 0 else: sig_overlap = vc[True] sig_not_overlap = vc[False] vc = (encode_chip_overlap.ix[not_sig_cnvs, c] > 0).value_counts() if True not in vc.index: not_sig_overlap = 0 else: not_sig_overlap = vc[True] not_sig_not_overlap = vc[False] ctable = [[sig_overlap, sig_not_overlap], [not_sig_overlap, not_sig_not_overlap]] o,p = stats.fisher_exact(ctable) odds.append(o) pvalues.append(p) encode_chip_res = pd.DataFrame({'odds':odds, 'pvalue':pvalues}, index=encode_chip_overlap.columns) encode_chip_res.sort_values(by='pvalue', inplace=True, ascending=False)